The Marathon Data

Intro to R and Some Descriptive Analyses

Author

Alessio Crippa

Published

September 22, 2022

Objective

This report contains code for an introduction to R and how to perform common descriptive analyses, both uni- and multi-variate, using the marathon data, which was use in the following publication Hyponatremia among Runners in the Boston Marathon, N Engl J Med 2005;352:1550-6 .

Descriptive abstract
Hyponatremia has emerged as an important cause of race-related death and life-threatening illness among marathon runners. We studied a cohort of marathon runners to estimate the incidence of hyponatremia and to identify the principal risk factors.

Load R packages

The following packages are used in the tutorial. If you want to reproduce the code chunks below, be sure to install them (e.g. install.packages("tidyverse")).

Read, explore and manipulate data

Read data

The marathon data are available in a csv format at http://alecri.github.io/downloads/data/marathon.csv .

marathon <- read.csv(here("data", "raw", "marathon_raw.csv"))
gt(head(marathon)) %>% tab_options(table.font.size = px(12))
id na age urinat3p prewt postwt height runtime trainpace prevmara fluidint waterload nsaid sex
1 138 24.20534 2 NA NA 1.72720 NA 480 3 1 2 2 2
2 142 44.28200 1 NA NA NA 161 430 40 1 2 2 1
3 151 41.96304 1 NA NA NA 156 360 40 2 NA NA 1
4 139 28.19713 2 NA NA 1.72720 346 630 1 1 2 1 1
5 145 30.18207 1 NA 50.68182 NA 185 NA 3 1 2 2 2
6 140 28.29295 1 NA 55.68182 1.60655 233 NA 25 2 2 1 2

Many other options exist, such as:

  • basic functions (read.table, read.csv, read.delim)
  • from Excel readxl::read_excel
  • from SPSS haven::read_sav
  • from SAS haven::read_sas
  • from Stata haven::read_dta

A comprehensive tutorial can be found here.

Explore the data

# what is the dimension of the data  
dim(marathon)
[1] 488  14
# i.e. how many obs/rows
nrow(marathon)
[1] 488
# how many cols/variables?
ncol(marathon)
[1] 14
# what's the name of the variable
colnames(marathon)
 [1] "id"        "na"        "age"       "urinat3p"  "prewt"     "postwt"   
 [7] "height"    "runtime"   "trainpace" "prevmara"  "fluidint"  "waterload"
[13] "nsaid"     "sex"      
# what is the structure of the data
glimpse(marathon)
Rows: 488
Columns: 14
$ id        <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1…
$ na        <int> 138, 142, 151, 139, 145, 140, 142, 140, 141, 138, 141, 143, …
$ age       <dbl> 24.20534, 44.28200, 41.96304, 28.19713, 30.18207, 28.29295, …
$ urinat3p  <int> 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ prewt     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ postwt    <dbl> NA, NA, NA, NA, 50.68182, 55.68182, 59.31818, 59.77273, 64.7…
$ height    <dbl> 1.72720, NA, NA, 1.72720, NA, 1.60655, NA, NA, NA, NA, NA, N…
$ runtime   <int> NA, 161, 156, 346, 185, 233, 183, 162, 182, 190, 169, 174, 1…
$ trainpace <int> 480, 430, 360, 630, NA, NA, 435, 450, 435, 440, 410, 405, 40…
$ prevmara  <int> 3, 40, 40, 1, 3, 25, 19, 2, 80, 10, 16, 3, 2, 8, 4, 3, 120, …
$ fluidint  <int> 1, 1, 2, 1, 1, 2, 2, 3, 1, 1, 2, 2, 1, 2, 3, 1, 1, 2, 2, 1, …
$ waterload <int> 2, 2, NA, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2,…
$ nsaid     <int> 2, 2, NA, 1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2,…
$ sex       <int> 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, …
# show the first rows
head(marathon)
  id  na      age urinat3p prewt   postwt  height runtime trainpace prevmara
1  1 138 24.20534        2    NA       NA 1.72720      NA       480        3
2  2 142 44.28200        1    NA       NA      NA     161       430       40
3  3 151 41.96304        1    NA       NA      NA     156       360       40
4  4 139 28.19713        2    NA       NA 1.72720     346       630        1
5  5 145 30.18207        1    NA 50.68182      NA     185        NA        3
6  6 140 28.29295        1    NA 55.68182 1.60655     233        NA       25
  fluidint waterload nsaid sex
1        1         2     2   2
2        1         2     2   1
3        2        NA    NA   1
4        1         2     1   1
5        1         2     2   2
6        2         2     1   2
# quick and dirty summary
summary(marathon)
       id              na             age           urinat3p    
 Min.   :  1.0   Min.   :114.0   Min.   :19.44   Min.   :1.000  
 1st Qu.:122.8   1st Qu.:138.0   1st Qu.:31.40   1st Qu.:1.000  
 Median :244.5   Median :141.0   Median :38.80   Median :1.000  
 Mean   :244.5   Mean   :140.4   Mean   :38.85   Mean   :1.056  
 3rd Qu.:366.2   3rd Qu.:143.0   3rd Qu.:45.71   3rd Qu.:1.000  
 Max.   :488.0   Max.   :158.0   Max.   :73.08   Max.   :2.000  
                                 NA's   :2       NA's   :8      
     prewt            postwt           height         runtime     
 Min.   : 42.05   Min.   : 42.73   Min.   :1.511   Min.   :142.0  
 1st Qu.: 60.00   1st Qu.: 60.17   1st Qu.:1.676   1st Qu.:195.0  
 Median : 68.64   Median : 67.95   Median :1.727   Median :220.0  
 Mean   : 69.26   Mean   : 68.62   Mean   :1.734   Mean   :225.5  
 3rd Qu.: 75.91   3rd Qu.: 75.23   3rd Qu.:1.800   3rd Qu.:248.0  
 Max.   :101.82   Max.   :100.00   Max.   :2.108   Max.   :400.0  
 NA's   :19       NA's   :20       NA's   :18      NA's   :11     
   trainpace        prevmara          fluidint       waterload    
 Min.   :330.0   Min.   :  0.000   Min.   :1.000   Min.   :1.000  
 1st Qu.:450.0   1st Qu.:  2.000   1st Qu.:1.000   1st Qu.:1.000  
 Median :480.0   Median :  5.000   Median :1.000   Median :2.000  
 Mean   :488.8   Mean   :  8.839   Mean   :1.512   Mean   :1.744  
 3rd Qu.:525.0   3rd Qu.: 10.000   3rd Qu.:2.000   3rd Qu.:2.000  
 Max.   :780.0   Max.   :120.000   Max.   :3.000   Max.   :2.000  
 NA's   :59      NA's   :3         NA's   :4       NA's   :12     
     nsaid            sex      
 Min.   :1.000   Min.   :1.00  
 1st Qu.:1.000   1st Qu.:1.00  
 Median :2.000   Median :1.00  
 Mean   :1.543   Mean   :1.34  
 3rd Qu.:2.000   3rd Qu.:2.00  
 Max.   :2.000   Max.   :2.00  
 NA's   :11                    

Manipulate data

Data manipulations consist of a multiplicity and combinations of steps. The dplyr package in tidyverse aids this task with a collections of verbs.
I will show how to:

  1. create/modify variables with the mutate verb.
marathon <- mutate(
  marathon, 
  # code a factor variables
  sex = factor(sex, levels = c(1, 2), labels = c("Male", "Female")),
  nas135 = factor(na <= 135, labels = c("na > 135", "na ≤ 135")),
  # compute new variables
  runtime = runtime/60,
  wtdiff = postwt - prewt,
  bmi = prewt/(height^2),
  # categorize continuous variables
  bmi_cat = cut(bmi, breaks = c(15, 20, 25, 33), labels = c("< 20", "20-25", ">25")),
  wtdiff_cat = cut(wtdiff, breaks = c(-5, -2, -1, 0, 1, 2, 3, 5), 
                   include.lowest = T, right = FALSE, ordered_result = TRUE)
)
  1. select only variables of interest with the select verb.
marathon_sub1 <- select(marathon, sex, runtime, ends_with("wt"), bmi, na)
gt(head(marathon_sub1)) %>% tab_options(table.font.size = px(12))
sex runtime prewt postwt bmi na
Female NA NA NA NA 138
Male 2.683333 NA NA NA 142
Male 2.600000 NA NA NA 151
Male 5.766667 NA NA NA 139
Female 3.083333 NA 50.68182 NA 145
Female 3.883333 NA 55.68182 NA 140
  1. select only some observations with the filter verb.
women_sub2 <- filter(marathon_sub1, sex == "Female", runtime >= 4, bmi > 25)
gt(women_sub2) %>% tab_options(table.font.size = px(12))
sex runtime prewt postwt bmi na
Female 5.683333 71.27841 69.54545 27.83617 138
Female 4.000000 76.13636 75.45455 25.52154 137
Female 5.500000 80.00000 79.31818 29.34917 158
Female 5.100000 62.27273 63.40909 25.11002 133
Female 6.666667 73.63636 73.63636 31.70461 135
  1. sort data according to increasing/decreasing values of some variables with the arrange verb.
arrange(women_sub2, desc(na), runtime)
     sex  runtime    prewt   postwt      bmi  na
1 Female 5.500000 80.00000 79.31818 29.34917 158
2 Female 5.683333 71.27841 69.54545 27.83617 138
3 Female 4.000000 76.13636 75.45455 25.52154 137
4 Female 6.666667 73.63636 73.63636 31.70461 135
5 Female 5.100000 62.27273 63.40909 25.11002 133

Chaining: wrap different functions inside each other

arrange(
  filter(
    select(
      mutate(marathon, 
             sex = factor(sex, levels = c(1, 2), labels = c("Male", "Female")),
             runtime = runtime/60,
             bmi = prewt/(height^2)),
      sex, runtime, ends_with("wt"), bmi, na),
    sex == "Female", runtime >= 4, bmi > 25), desc(na), runtime)
[1] sex     runtime prewt   postwt  bmi     na     
<0 rows> (or 0-length row.names)

The %>% (pipe) operator

This operator allows you to pipe the output from one function to the input of another function.

marathon %>% 
  mutate(sex = factor(sex, levels = c(1, 2), labels = c("Male", "Female")),
         runtime = runtime/60,
         bmi = prewt/(height^2)) %>% 
  select(sex, runtime, ends_with("wt"), bmi, na) %>%
  filter(sex == "Female", runtime >= 4, bmi > 25) %>%
  arrange(desc(na), runtime)
[1] sex     runtime prewt   postwt  bmi     na     
<0 rows> (or 0-length row.names)

Summary statistics and graphs

ggplot2: grammar of graphics

ggplot(marathon, aes(wtdiff, na, col = bmi)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ sex) +
  labs(x = "Weight change, kg", y = "Sodium concentration, mmol per liter") +
  theme_bw()

Univariate descriptives

# for a continuous variable
summary(marathon$na)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  114.0   138.0   141.0   140.4   143.0   158.0 
quantile(marathon$na)
  0%  25%  50%  75% 100% 
 114  138  141  143  158 
c("Mean" = mean(marathon$na), "SD" = sd(marathon$na),
  "Median" = median(marathon$na), "IQR" = IQR(marathon$na),
  "Min" = min(marathon$na), "Max" = max(marathon$na),
  "25th percentile" = quantile(marathon$na, .25),
  "75th percentile" = quantile(marathon$na, .75))
               Mean                  SD              Median                 IQR 
         140.405738            4.752102          141.000000            5.000000 
                Min                 Max 25th percentile.25% 75th percentile.75% 
         114.000000          158.000000          138.000000          143.000000 
summarise(marathon, mean = mean(na), sd = sd(na), median = median(na), iqr = IQR(na))
      mean       sd median iqr
1 140.4057 4.752102    141   5
# histogram (counts)
ggplot(marathon, aes(na)) + 
  geom_histogram() +
  labs(x = "Serum sodium concentration (mmol/liter)")

# histogram (density)
ggplot(marathon, aes(na)) +
  geom_histogram(stat = "density", n = 2^5) +
  geom_line(stat = "density", col = "blue", size = 1.5) +
  labs(x = "Sodium concentration, mmol per liter")

# boxplot
ggplot(marathon, aes(na)) + 
  geom_boxplot() + 
  coord_flip() +
  labs(x = "Serum sodium concentration (mmol/liter)")

# violin plot
ggplot(marathon, aes(x = "", y = na)) + 
  geom_violin(fill = "lightblue") + 
  geom_jitter(size = 1) +
  labs(x = "", y = "Serum sodium concentration (mmol/liter)")

# for a categorical variable
table(marathon$nas135)

na > 135 na ≤ 135 
     426       62 
prop.table(table(marathon$nas135))

 na > 135  na ≤ 135 
0.8729508 0.1270492 
# alternative
dtab_nas135 <- count(marathon, nas135) %>% 
  mutate(
    prop = n/sum(n),
    percentage = scales::percent(prop)
  )
dtab_nas135
    nas135   n      prop percentage
1 na > 135 426 0.8729508        87%
2 na ≤ 135  62 0.1270492        13%
# barplot (counts)
ggplot(marathon, aes(nas135)) +
  geom_bar()

# barplot (proportion)
ggplot(marathon, aes(nas135)) +
  geom_bar(aes(y = ..count../sum(..count..))) +
  labs(x = "", y = "Prevalence of hyponatremia")

Bivariate descriptives (association)

# Continuous + categorical
by(marathon$na, marathon$sex, summary)
marathon$sex: Male
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    118     139     141     141     144     156 
------------------------------------------------------------ 
marathon$sex: Female
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  114.0   136.0   139.0   139.2   142.8   158.0 
group_by(marathon, sex) %>% 
  summarise(n_nomiss = sum(!is.na(na)), 
            mean = mean(na), 
            median = median(na),
            sd = sd(na), 
            iqr = IQR(na), 
            min = min(na), 
            max = max(na),
            p25 = quantile(na, .25),
            p75 = quantile(na, .75))
# A tibble: 2 × 10
  sex    n_nomiss  mean median    sd   iqr   min   max   p25   p75
  <fct>     <int> <dbl>  <dbl> <dbl> <dbl> <int> <int> <dbl> <dbl>
1 Male        322  141.    141  4.15  5      118   156   139  144 
2 Female      166  139.    139  5.55  6.75   114   158   136  143.
ggplot(marathon, aes(sex, na)) +
  geom_boxplot() + 
  labs(x = "", y = "Serum sodium concentration (mmol/liter)")

ggplot(marathon, aes(na, col = sex)) +
  geom_line(stat = "density") + 
  labs(x = "Serum sodium concentration (mmol/liter)", y = "Density")

# statistical tests
test_nasex <- t.test(na ~ sex, data = marathon, var.equal = TRUE)
test_nasex

    Two Sample t-test

data:  na by sex
t = 4.1777, df = 486, p-value = 3.492e-05
alternative hypothesis: true difference in means between group Male and group Female is not equal to 0
95 percent confidence interval:
 0.9882093 2.7431385
sample estimates:
  mean in group Male mean in group Female 
            141.0404             139.1747 
broom::tidy(test_nasex)
# A tibble: 1 × 10
  estim…¹ estim…² estim…³ stati…⁴ p.value param…⁵ conf.…⁶ conf.…⁷ method alter…⁸
    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <chr>  <chr>  
1    1.87    141.    139.    4.18 3.49e-5     486   0.988    2.74 Two S… two.si…
# … with abbreviated variable names ¹​estimate, ²​estimate1, ³​estimate2,
#   ⁴​statistic, ⁵​parameter, ⁶​conf.low, ⁷​conf.high, ⁸​alternative
wilcox.test(na ~ sex, data = marathon)

    Wilcoxon rank sum test with continuity correction

data:  na by sex
W = 33002, p-value = 1.996e-05
alternative hypothesis: true location shift is not equal to 0
# frequencies table
tab_nabmi <- table(bmi_cat = marathon$bmi_cat, nas135 = marathon$nas135)
tab_nabmi
       nas135
bmi_cat na > 135 na ≤ 135
  < 20        34       15
  20-25      294       32
  >25         77       13
prop.table(tab_nabmi, margin = 1)
       nas135
bmi_cat   na > 135   na ≤ 135
  < 20  0.69387755 0.30612245
  20-25 0.90184049 0.09815951
  >25   0.85555556 0.14444444
# customizable tables
Epi::stat.table(list(nas135, bmi_cat), 
           list(count(), percent(nas135)), 
           marathon, margin = T)
 ------------------------------------------- 
           -------------bmi_cat------------- 
 nas135        < 20   20-25     >25   Total  
 ------------------------------------------- 
 na > 135        34     294      77     426  
               69.4    90.2    85.6    87.3  
                                             
 na ≤ 135        15      32      13      62  
               30.6     9.8    14.4    12.7  
                                             
                                             
 Total           49     326      90     488  
              100.0   100.0   100.0   100.0  
 ------------------------------------------- 
# bar plot
group_by(marathon, bmi_cat) %>%
  filter(!is.na(bmi_cat)) %>% 
  count(nas135) %>%
  mutate(perc = n/sum(n)) %>%
  ggplot(aes(x = nas135, y = perc, fill = bmi_cat)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  labs(x = "", y = "Frequency", fill = "Categories of BMI")

# statistical tests
chisq.test(tab_nabmi)

    Pearson's Chi-squared test

data:  tab_nabmi
X-squared = 16.629, df = 2, p-value = 0.000245
with(marathon, cor(na, wtdiff,  method = "pearson", use = "complete.obs"))
[1] -0.4106029
p_nawtdiff <- ggplot(marathon, aes(wtdiff, na)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(x = "Weight change (kg) pre/post race", y = "Serum sodium concentration (mmol/liter)")
ggExtra::ggMarginal(p_nawtdiff, type = "histogram")

Table 1

# Table 1
tab1 <- arsenal::tableby(nas135 ~ age + sex + wtdiff + bmi_cat + runtime, data = marathon)
summary(tab1)
na > 135 (N=426) na ≤ 135 (N=62) Total (N=488) p value
age 0.519
   N-Miss 2 0 2
   Mean (SD) 38.955 (9.402) 38.129 (9.474) 38.849 (9.406)
   Range 19.436 - 73.079 23.206 - 60.244 19.436 - 73.079
sex < 0.001
   Male 297 (69.7%) 25 (40.3%) 322 (66.0%)
   Female 129 (30.3%) 37 (59.7%) 166 (34.0%)
wtdiff < 0.001
   N-Miss 28 5 33
   Mean (SD) -0.892 (1.541) 0.725 (1.441) -0.689 (1.619)
   Range -7.045 - 4.091 -2.500 - 3.409 -7.045 - 4.091
bmi_cat < 0.001
   N-Miss 21 2 23
   < 20 34 (8.4%) 15 (25.0%) 49 (10.5%)
   20-25 294 (72.6%) 32 (53.3%) 326 (70.1%)
   >25 77 (19.0%) 13 (21.7%) 90 (19.4%)
runtime < 0.001
   N-Miss 9 2 11
   Mean (SD) 3.695 (0.656) 4.202 (0.785) 3.759 (0.693)
   Range 2.367 - 6.000 2.650 - 6.667 2.367 - 6.667

Statistical models

Linear regression

E[Y|X] = \beta_0 + \beta_1X_1 + \dots + \beta_kX_k

# univariate linear regression
fit1 <- lm(na ~ sex, data = marathon)
summary(fit1)

Call:
lm(formula = na ~ sex, data = marathon)

Residuals:
     Min       1Q   Median       3Q      Max 
-25.1747  -2.1747  -0.0404   2.9596  18.8253 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 141.0404     0.2605 541.503  < 2e-16 ***
sexFemale    -1.8657     0.4466  -4.178 3.49e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.674 on 486 degrees of freedom
Multiple R-squared:  0.03467,   Adjusted R-squared:  0.03268 
F-statistic: 17.45 on 1 and 486 DF,  p-value: 3.492e-05
tidy(fit1)
# A tibble: 2 × 5
  term        estimate std.error statistic   p.value
  <chr>          <dbl>     <dbl>     <dbl>     <dbl>
1 (Intercept)   141.       0.260    542.   0        
2 sexFemale      -1.87     0.447     -4.18 0.0000349
# multivariable linear regression
fit2 <- lm(na ~ wtdiff + sex, data = marathon)
ci.lin(fit2)
               Estimate    StdErr          z            P       2.5%
(Intercept) 139.8787975 0.2849913 490.817817 0.000000e+00 139.320225
wtdiff       -1.1572580 0.1310845  -8.828337 1.062437e-18  -1.414179
sexFemale    -0.8158008 0.4452803  -1.832106 6.693557e-02  -1.688534
                   97.5%
(Intercept) 140.43737013
wtdiff       -0.90033715
sexFemale     0.05693247
marathon <- mutate(marathon,
                   pred_fit2 = predict(fit2, newdata = marathon))
# multivariable linear regression with interaction
fit3 <- lm(na ~ wtdiff*sex, data = marathon)
ci.lin(fit3, ctr.mat = rbind(c(0, 1, 0, 0),
                             c(0, 1, 0, 1)))
      Estimate    StdErr         z            P      2.5%      97.5%
[1,] -1.076637 0.1543198 -6.976661 3.022779e-12 -1.379098 -0.7741756
[2,] -1.366193 0.2484292 -5.499325 3.812468e-08 -1.853105 -0.8792807
marathon <- mutate(marathon,
                   pred_fit3 = predict(fit3, newdata = marathon))
# graphical comparison
gridExtra::grid.arrange(
  ggplot(marathon, aes(wtdiff, pred_fit2, col = sex)) +
    geom_line() +
    labs(x = "Weight change, kg", y = "Predicted mean na"),
    ggplot(marathon, aes(wtdiff, pred_fit3, col = sex)) +
    geom_line() +
    labs(x = "Weight change, kg", y = "Predicted mean na"),
  ncol = 2
)

Logistic regression

\log(\textrm{odds}(Y|X)) = \beta_0 + \beta_1X_1 + \dots + \beta_kX_k

# univariate logistic regression
fit4 <- glm(nas135 ~ wtdiff, data = marathon, family = binomial)
ci.exp(fit4)
            exp(Est.)      2.5%     97.5%
(Intercept) 0.1518239 0.1112437 0.2072074
wtdiff      2.0717812 1.6689660 2.5718185
marathon <- marathon %>% 
  mutate(pred_p = predict(fit4, newdata = marathon, type = "response"),
         pred_odds = predict(fit4, newdata = marathon, type = "link")) 

grid.arrange(
  ggplot(marathon, aes(wtdiff, pred_p)) +
    geom_line() +
    labs(x = "Weight change, kg", y = "Predicted probability"),
  ggplot(marathon, aes(wtdiff, pred_odds)) +
    geom_line() +
    labs(x = "Weight change, kg", y = "Predicted odds"),
  ncol = 2
)

# multivariable logistic regression
fit5 <- glm(nas135 ~ wtdiff + sex, data = marathon, family = binomial)
ci.exp(fit5)
            exp(Est.)       2.5%     97.5%
(Intercept) 0.1001537 0.06323444 0.1586281
wtdiff      2.0189387 1.61520282 2.5235924
sexFemale   2.3858241 1.29710329 4.3883605
mutate(marathon, pred = predict(fit5, newdata = marathon, type = "response")) %>% 
  ggplot(aes(wtdiff, pred, col = sex)) +
  geom_line() +
  labs(x = "Weight change, kg", y = "Predicted probability")